{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
   },
   "outputs": [],
   "source": [
    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
    "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
    "# For example, here's several helpful packages to load\n",
    "\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "# Input data files are available in the read-only \"../input/\" directory\n",
    "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
    "\n",
    "import os\n",
    "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
    "    for filename in filenames:\n",
    "        print(os.path.join(dirname, filename))\n",
    "\n",
    "# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
    "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session\n",
    "\n",
    "from matplotlib import pyplot as plt\n",
    "import seaborn as sns\n",
    "import gzip\n",
    "\n",
    "test_file = 'test.gz'\n",
    "samplesubmision_file = 'sampleSubmission.gz'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processing Chunk No. 1\n",
      "Processing Chunk No. 2\n",
      "Processing Chunk No. 3\n",
      "Processing Chunk No. 4\n",
      "Processing Chunk No. 5\n",
      "Processing Chunk No. 6\n",
      "Processing Chunk No. 7\n",
      "Processing Chunk No. 8\n",
      "Processing Chunk No. 9\n",
      "Processing Chunk No. 10\n",
      "Processing Chunk No. 11\n",
      "Processing Chunk No. 12\n",
      "Processing Chunk No. 13\n"
     ]
    }
   ],
   "source": [
    "chunksize = 10 ** 6\n",
    "num_of_chunk = 0\n",
    "train = pd.DataFrame()\n",
    "    \n",
    "for chunk in pd.read_csv('train.csv', chunksize=chunksize):\n",
    "    num_of_chunk += 1\n",
    "    train = pd.concat([train, chunk.sample(frac=.05, replace=False, random_state=123)], axis=0)\n",
    "    print('Processing Chunk No. ' + str(num_of_chunk))     \n",
    "    \n",
    "train.reset_index(inplace=True)\n",
    "\n",
    "# Backup data train length, df for later use re-partitioned index\n",
    "train_len = len(train)\n",
    "train_len"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([train, pd.read_csv(test_file, compression='gzip')]).drop(['index', 'id'], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a function to convert hour data into date format\n",
    "def get_date(hour):\n",
    "    y = '20'+str(hour)[:2]\n",
    "    m = str(hour)[2:4]\n",
    "    d = str(hour)[4:6]\n",
    "    return y+'-'+m+'-'+d\n",
    "\n",
    "# Create a weekday field and fill in the hour after conversion\n",
    "df['weekday'] = pd.to_datetime(df.hour.apply(get_date)).dt.dayofweek.astype(str)\n",
    "\n",
    "# Create a function to convert hour data into time period\n",
    "def tran_hour(x):\n",
    "    x = x % 100\n",
    "    while x in [23,0]:\n",
    "        return '23-01'\n",
    "    while x in [1,2]:\n",
    "        return '01-03'\n",
    "    while x in [3,4]:\n",
    "        return '03-05'\n",
    "    while x in [5,6]:\n",
    "        return '05-07'\n",
    "    while x in [7,8]:\n",
    "        return '07-09'\n",
    "    while x in [9,10]:\n",
    "        return '09-11'\n",
    "    while x in [11,12]:\n",
    "        return '11-13'\n",
    "    while x in [13,14]:\n",
    "        return '13-15'\n",
    "    while x in [15,16]:\n",
    "        return '15-17'\n",
    "    while x in [17,18]:\n",
    "        return '17-19'\n",
    "    while x in [19,20]:\n",
    "        return '19-21'\n",
    "    while x in [21,22]:\n",
    "        return '21-23'\n",
    "\n",
    "# Convert hour to time period\n",
    "df['hour'] = df.hour.apply(tran_hour)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Confirmation document type by\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After confirming the value count of each feature, it was found that the feature of type int, the feature with the most value is only 4,333 value counts, and a data set of 6 million data is obviously a non-continuous row variable. It can be concluded that all the features of this data set are variables of Object type."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len_of_feature_count = []\n",
    "for i in df.columns[2:23].tolist():\n",
    "    print(i, ':', len(df[i].astype(str).value_counts()))\n",
    "    len_of_feature_count.append(len(df[i].astype(str).value_counts()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a list and save the feature names of the lines that need to be converted into the list\n",
    "need_tran_feature = df.columns[2:4].tolist() + df.columns[13:23].tolist()\n",
    "\n",
    "# Convert variables to object type in sequence\n",
    "for i in need_tran_feature:\n",
    "    df[i] = df[i].astype(str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some features have extremely high value counts, even millions of data values. In this case, one-hot encoding will undoubtedly cause dimensionality collapse. Here, the value count of each feature is limited to 10, and once the value exceeds 10, the reduction operation will be performed.\n",
    "\n",
    "The method of reduction is to calculate the click-through rate of all values of a variable, and the click-through rate is divided into very_high, higher, mid, lower, very_low, etc. 5 levels."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "obj_features = []\n",
    "\n",
    "for i in range(len(len_of_feature_count)):\n",
    "    if len_of_feature_count[i] > 10:\n",
    "        obj_features.append(df.columns[2:23].tolist()[i])\n",
    "obj_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_describe = df.describe()\n",
    "df_describe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def obj_clean(X):\n",
    "    # Define a function to reduce the data value, processing one feature vector at a time\n",
    "\n",
    "    def get_click_rate(x):\n",
    "        # Define a function to get the click rate\n",
    "        temp = train[train[X.columns[0]] == x]\n",
    "        res = round((temp.click.sum() / temp.click.count()),3)\n",
    "        return res\n",
    "\n",
    "    def get_type(V, str):\n",
    "        # Define a function to obtain the level distance judgment of the new data value\n",
    "        very_high = df_describe.loc['mean','click'] + 0.04\n",
    "        higher = df_describe.loc['mean','click'] + 0.02\n",
    "        lower = df_describe.loc['mean','click'] - 0.02\n",
    "        very_low = df_describe.loc['mean','click'] - 0.04\n",
    "\n",
    "        vh_type = V[V[str] > very_high].index.tolist()\n",
    "        hr_type = V[(V[str] > higher) & (V[str] < very_high)].index.tolist()\n",
    "        vl_type = V[V[str] < very_low].index.tolist()\n",
    "        lr_type = V[(V[str] < lower) & (V[str] > very_low)].index.tolist()\n",
    "\n",
    "        return vh_type, hr_type, vl_type, lr_type\n",
    "\n",
    "    def clean_function(x):\n",
    "        # Define a function to convert data values based on the level distance\n",
    "        # The basis for judgment is: the plus or minus 4% of the total average click-through rate is very_high(low), and the plus or minus 2% of the total average click-through rate is higher (lower)\n",
    "        while x in type_[0]:\n",
    "            return 'very_high'\n",
    "        while x in type_[1]:\n",
    "            return 'higher'\n",
    "        while x in type_[2]:\n",
    "            return 'very_low'\n",
    "        while x in type_[3]:\n",
    "            return 'lower'\n",
    "        return 'mid'\n",
    "        \n",
    "    print('Run: ', X.columns[0])\n",
    "    fq = X[X.columns[0]].value_counts()\n",
    "    # Create a temporary frequency list of data values\n",
    "    # Theoretically, all data values are classified and converted to get the best results; in practice, in order to perform time efficiency, the frequency of data values after the top 1000 rows will be dropped.\n",
    "    if len(fq) > 1000:\n",
    "        fq = fq[:1000]\n",
    "\n",
    "    # Convert the frequency list into a dataframe, and fill in the index in a new field.\n",
    "    fq = pd.DataFrame(fq)\n",
    "    fq['new_column'] = fq.index    \n",
    "\n",
    "    # Use the index to call the get_click_rate function to obtain the click rate of each data value\n",
    "    fq['click_rate'] = fq.new_column.apply(get_click_rate)\n",
    "\n",
    "   # Invoke the get_type function to obtain the classification distance, and save it as a list, so that it can be used by the next clean_function\n",
    "    type_ = get_type(fq, 'click_rate')\n",
    "\n",
    "   # Invoke clean_funtion funtion, return the converted feature vector\n",
    "    return X[X.columns[0]].apply(clean_function)\n",
    "\n",
    "# Use the for loop to input the features to be converted into the obj_clean function\n",
    "for i in obj_features:    \n",
    "    df[[i]] = obj_clean(df[[i]])\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Confirm the data value status of all features\n",
    "for i in df.columns:\n",
    "    sns.countplot(x = i, hue = \"click\", data = df)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "According to the chart listed above, it is obvious that ['device_id','C14','C17','C19','C20','C21'] these features have only one value, which is not for the prediction model Meaning, so move these features out of the data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.drop(['device_id', 'C14', 'C17', 'C19', 'C20', 'C21'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-hot encoding for all variables\n",
    "df = pd.get_dummies(df)\n",
    "\n",
    "# According to the processed df data table, split train and test again\n",
    "train = df[:train_len]\n",
    "test = df[train_len:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Export the processed train and test data sets to avoid lengthy processing time each time.\n",
    "\n",
    "# train.to_csv('new_train.csv', index=False)\n",
    "# test.to_csv('new_test.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Read the processed train, test data set, skip the lengthy re-execution processing time.\n",
    "# train = pd.read_csv('new_train.csv')\n",
    "# test = pd.read_csv('new_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "del df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because the data set is very large, and the proportion of positive labels only accounts for about 17% of all data, the proportion is obviously out of balance, and an algorithm with enhanced weighting function is required. Therefore, it is decided to use the xgboost algorithm to solve the problem of strengthening the weight, and at the same time use the GPU to effectively save the computing time.\n",
    "\n",
    "In order to save time in tuning parameters, before implementing the prediction model, 100 decision trees will be established, the best parameters and important features will be found by Grid Search, and finally the model will be established by the xgboost algorithm. In order to shorten the construction time of the decision tree, the negative label data will be sampled, and all the positive label data will be combined into a 50% data set to balance the weight problem. At the same time, because the ratio of positive and negative labels is balanced, ROC_AUC score will be used for parameter adjustment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# From the train data set, from the data with the label 0, randomly sample as many as the data with the label 1, and combine them into a data set with 50% of the positive and negative labels.\n",
    "pre_X = train[train['click'] == 0].sample(n=len(train[train['click'] == 1]), random_state=111)\n",
    "pre_X = pd.concat([pre_X, train[train['click'] == 1]]).sample(frac=1)\n",
    "pre_y = pre_X[['click']]\n",
    "pre_X.drop(['click'], axis=1, inplace=True)\n",
    "test.drop(['click'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Split the new data set into training set and validation set\n",
    "pre_X_train, pre_X_test, pre_y_train, pre_y_test = train_test_split(pre_X, pre_y, test_size=0.20, stratify=pre_y, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute Grid Search to adjust the parameters and create 100 trees to obtain the best parameters\n",
    "params = {\"criterion\":[\"gini\", \"entropy\"], \"max_depth\":range(1,20)}\n",
    "grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=params, scoring='roc_auc', cv=100, verbose=1, n_jobs=-1)\n",
    "grid_search.fit(pre_X_train, pre_y_train)\n",
    "grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a decision tree model based on the results of Grid Search and fit the complete data (pre-data)\n",
    "tree = grid_search.best_estimator_\n",
    "tree.fit(pre_X,pre_y)\n",
    "\n",
    "# Output important features and sort them according to their importance\n",
    "feature_importances = pd.DataFrame(tree.feature_importances_)\n",
    "feature_importances.index = pre_X_train.columns\n",
    "feature_importances = feature_importances.sort_values(0,ascending=False)\n",
    "feature_importances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adjust the pre-work training set and validation set, reduce the feature importance to 1/3 of the importance ranking based on the feature importance\n",
    "pre_X_train = pre_X_train[feature_importances.index[:int(len(feature_importances)/3)]]\n",
    "pre_X_test = pre_X_test[feature_importances.index[:int(len(feature_importances)/3)]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use 33% of the important features to re-adjust the Grid Search parameters\n",
    "params = {\"criterion\":[\"gini\", \"entropy\"], \"max_depth\":range(1,12)}\n",
    "grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=params, scoring='roc_auc', cv=100, verbose=1, n_jobs=-1)\n",
    "grid_search.fit(pre_X_train, pre_y_train)\n",
    "grid_search.best_score_, grid_search.best_estimator_, grid_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adjust the complete data set of the pre-work, and reduce the importance of features to 1/3 of the importance ranking\n",
    "pre_X = pre_X[feature_importances.index[:int(len(feature_importances)/3)]]\n",
    "\n",
    "# Create a decision tree model based on the results of Grid Search and fit the complete data (pre-data)\n",
    "tree = grid_search.best_estimator_\n",
    "tree.fit(pre_X,pre_y)\n",
    "\n",
    "# Output important features and sort them according to their importance\n",
    "feature_importances = pd.DataFrame(tree.feature_importances_)\n",
    "feature_importances.index = pre_X_train.columns\n",
    "feature_importances = feature_importances.sort_values(0,ascending=False)\n",
    "feature_importances"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The features of the final prediction model will use variables with feature values above .005\n",
    "feature_len = len(feature_importances[feature_importances[feature_importances.columns[0]] > 0.005])\n",
    "\n",
    "# Adjust the characteristics of the final complete Train Set and Test Set\n",
    "y = train[['click']]\n",
    "X = train[feature_importances[:feature_len].index]\n",
    "test = test[feature_importances[:feature_len].index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from xgboost import XGBClassifier\n",
    "\n",
    "# Use xgboost to model, and specify the node depth limit obtained from the previous tuning. Use xgboost to model and specify the node depth limit obtained from the previous tuning\n",
    "model = XGBClassifier(tree_method = 'gpu_hist', n_jobs=-1, n_estimators=500, max_depth=11)\n",
    "model.fit(X,y.values.ravel())\n",
    "y_pred = model.predict(X)\n",
    "print(\"Roc_auc_score: \",roc_auc_score(y,y_pred)*100,\"%\")\n",
    "\n",
    "# Draw the confusion matrix and view the prediction results\n",
    "confmat = confusion_matrix(y_true=y, y_pred=y_pred, labels=[0, 1])\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(2.5, 2.5))\n",
    "ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)\n",
    "for i in range(confmat.shape[0]):\n",
    "    for j in range(confmat.shape[1]):\n",
    "        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')\n",
    "\n",
    "plt.xlabel('Predicted label')\n",
    "plt.ylabel('True label')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Export submission and submit\n",
    "submission = pd.read_csv(samplesubmision_file, compression='gzip', index_col='id')\n",
    "submission[submission.columns[0]] = model.predict_proba(test)[:,1]\n",
    "submission.to_csv('submission.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
